%load_ext autoreload
%autoreload 2
import logging
logging.basicConfig(level=logging.DEBUG)

import polars as pl
pl.enable_string_cache() # TODO: remove eventually with proper enums
from zipfile import ZipFile
from pathlib import Path
from polars import col as C
import plotly.express as px
from ctrace.constants import *
import ctrace as ct
import tempfile

_logger = logging.getLogger(__name__)
# The zip files released by climate trace

_files =  [ 
        "agriculture.zip",
        "buildings.zip",
        "fluorinated_gases.zip", 
        "forestry_and_land_use.zip",
        "fossil_fuel_operations.zip", 
        "manufacturing.zip",
        "mineral_extraction.zip",
         "power.zip",
         "transportation.zip",
        "waste.zip"
            ]

def _recast_parquet(df) -> pl.DataFrame:
    cf_cols = [SOURCE_TYPE, CAPACITY, CAPACITY_FACTOR, ACTIVITY, CO2_EMISSIONS_FACTOR,
         CH4_EMISSIONS_FACTOR, N2O_EMISSIONS_FACTOR, CO2_EMISSIONS, CH4_EMISSIONS, 
        N2O_EMISSIONS, TOTAL_CO2E_20YRGWP, TOTAL_CO2E_100YRGWP]
    return (df.with_columns(
                c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
                c_gas.cast(ct.gas_enum, strict=False).alias(GAS),
                # TODO: change eventually to enum
                c_original_inventory_sector.cast(pl.String).alias(ORIGINAL_INVENTORY_SECTOR),
                c_temporal_granularity.cast(ct.temporal_granularity_enum).alias(TEMPORAL_GRANULARITY),
            ).with_columns(
        *[C("conf_" + col_name).cast(pl.Categorical, strict=False).alias("conf_" + col_name) for col_name in cf_cols]
            ))

def _load_sources(fp) -> pl.DataFrame:
    dates = [
        START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
    ]
    uint64s = [SOURCE_ID]
    floats = [EMISSIONS_QUANTITY, EMISSIONS_FACTOR, CAPACITY, CAPACITY_FACTOR, ACTIVITY, LAT, LON]
    enums = {
        ISO3_COUNTRY:ct.iso3_enum, 
        GAS:ct.gas_enum, 
        TEMPORAL_GRANULARITY:ct.temporal_granularity_enum,
        # TODO: activate once the whole list is gathered
        # ORIGINAL_INVENTORY_SECTOR:ct.inventory_sector_enum
    }
    # TODO: make it lazy
    df = pl.read_csv(fp.read(), infer_schema_length=0) #.lazy()
    num_other = 12
    check_cols = [ACTIVITY, ACTIVITY_UNITS, EMISSIONS_FACTOR, EMISSIONS_FACTOR_UNITS, 
                  CAPACITY_UNITS, CAPACITY, CAPACITY_FACTOR, GEOMETRY_REF, LAT, LON, ORIGINAL_INVENTORY_SECTOR] + [f"other{i}" for i in range(1,num_other+1)] + [f"other{i}_def" for i in range(1,num_other+1)]
    for col_name in check_cols:
        if col_name not in df.columns:
            df = df.with_columns(pl.lit(None).cast(pl.String, strict=False).alias(col_name))
    # Check that the columns match exactly
    s1 = set(df.columns)
    s2 = set(all_columns)
    assert s1 == s2, (s1-s2, s2-s1, list(zip(sorted(df.columns), sorted(all_columns))))
    df = df.select(*all_columns)
    return (df.with_columns(
        # Only start_time and end_time are required
    *[C(col_name).str.to_datetime(strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates])
            .with_columns(
                c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
                c_gas.cast(ct.gas_enum).alias(GAS),
                c_temporal_granularity.cast(ct.temporal_granularity_enum).alias("temporal_granularity"),
                # TODO: change eventually to enum
                # c_original_inventory_sector.cast(pl.Categorical).alias(ORIGINAL_INVENTORY_SECTOR)
    ).with_columns(*[C(col_name).cast(pl.Float64, strict=False).alias(col_name) for col_name in floats])
            .with_columns(*[C(col_name).cast(pl.UInt64).alias(col_name) for col_name in uint64s])
            # This is for debugging the memory consumption, remove eventually
            .limit(1_000_000_000)
    )

def _load_source_confidence(fp) -> pl.DataFrame:
    dates = [
        START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
    ]
    cf_cols = [SOURCE_TYPE, CAPACITY, CAPACITY_FACTOR, ACTIVITY, CO2_EMISSIONS_FACTOR,
         CH4_EMISSIONS_FACTOR, N2O_EMISSIONS_FACTOR, CO2_EMISSIONS, CH4_EMISSIONS, 
        N2O_EMISSIONS, TOTAL_CO2E_20YRGWP, TOTAL_CO2E_100YRGWP]
    uint64s = [SOURCE_ID]
    # TODO: make it lazy
    df = pl.read_csv(fp.read(), infer_schema_length=0) #.lazy()
    sels = ([C(col_name).str.to_datetime(strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates]
           + [c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY), c_source_id.cast(pl.UInt64).alias(SOURCE_ID)]
           + [C(col_name).cast(pl.Categorical, strict=False).alias("conf_"+col_name) for col_name in cf_cols]
           )
    df = df.select(*sels)
    # For debugging
    return df #.limit(1_000)

def _load_source_conf(s_fp, c_fp) -> pl.DataFrame:
    s_df = _load_sources(s_fp)
    c_df = _load_source_confidence(c_fp)
    return (s_df.join(c_df.drop(["created_date", "modified_date"]),
                      on=["start_time", "end_time", "iso3_country", "source_id"], 
                      how="left")
      )

def read_all_sources(p: Path) -> pl.DataFrame:
    res_df = None
    # dfs = []
    # TODO: replace by a proper temp directory
    tmp_dir = Path(tempfile.gettempdir())
    data_files = []
    for fname in _files:
        _logger.debug(f"open {fname}")
        zf = ZipFile(Path(ct_data_source) / fname)
        source_names = [n for n in zf.namelist() if n.endswith("-sources.csv")]
        _logger.debug(f"sources: {source_names}")
        for sname in source_names:
            _logger.debug(f"opening {fname} / {sname}")
            c_name = sname.replace("_emissions-sources.csv", "_emissions-sources_confidence.csv")
            _logger.debug(f"opening {fname} / {c_name}")
            df = _load_source_conf(zf.open(sname), zf.open(c_name))
            df = (df.with_columns(
                pl.lit(fname.replace(".zip", "")).cast(pl.Categorical).alias("ct_package"),
                pl.lit(sname.replace("_emissions-sources.csv", "")).cast(pl.Categorical).alias("ct_file")))
            tmp_name = tmp_dir / sname.replace(".csv",".parquet")
            df.write_parquet(tmp_name)
            data_files.append(tmp_name)
            _logger.debug(f"wrote {tmp_name}")
    dfs = []
    for tmp_name in data_files:
        _logger.debug(f"scan {tmp_name}")
        df = pl.scan_parquet(tmp_name)
        df = df.pipe(_recast_parquet)
        dfs.append(df)
    res_df = pl.concat(dfs)
    return res_df

def _load_country_emissions(fp) -> pl.DataFrame:
    dates = [
        START_TIME, END_TIME, CREATED_DATE, MODIFIED_DATE
    ]
    floats = [EMISSIONS_QUANTITY]
    enums = {
        ISO3_COUNTRY:ct.iso3_enum, 
        GAS:ct.gas_enum, 
        TEMPORAL_GRANULARITY:ct.temporal_granularity_enum,
        # TODO: activate once the whole list is gathered
        # ORIGINAL_INVENTORY_SECTOR:ct.inventory_sector_enum
    }
    df = pl.read_csv(fp.read(), infer_schema_length=0)
    all_columns = ['iso3_country',
             'start_time',
             'end_time',
             'original_inventory_sector',
             'gas',
             'emissions_quantity',
             'emissions_quantity_units',
             'temporal_granularity',
             'created_date',
             'modified_date']
    # Check that the columns match exactly
    s1 = set(df.columns)
    s2 = set(all_columns)
    assert s1 == s2, (s1-s2, s2-s1, list(zip(sorted(df.columns), sorted(all_columns))))
    df = df.select(*all_columns)
    return (df.with_columns(
        # Only start_time and end_time are required
    *[C(col_name).str.to_datetime(format="%Y-%m-%d %H:%M:%S%.f",
                                  strict=(col_name in {START_TIME, END_TIME})).alias(col_name) for col_name in dates])
            .with_columns(
                c_iso3_country.cast(ct.iso3_enum).alias(ISO3_COUNTRY),
                c_gas.cast(ct.gas_enum, strict=False).alias(GAS),
                # TODO: causing issues
                # c_original_inventory_sector.cast(pl.Categorical).alias(ORIGINAL_INVENTORY_SECTOR),
                c_temporal_granularity.cast(ct.temporal_granularity_enum).alias(TEMPORAL_GRANULARITY)
    ).with_columns(*[C(col_name).cast(pl.Float64, strict=False).alias(col_name) for col_name in floats])
    )


def read_country_emissions(p: Path) -> pl.DataFrame:
    # TODO: add meta info about the provenance: ct_package and ct_file
    # So fast there is no need to store a materialized version.
    dfs = []
    for fname in _files:
        _logger.debug(f"open {fname}")
        zf = ZipFile(Path(ct_data_source) / fname)
        source_names = [n for n in zf.namelist() if n.endswith("_country_emissions.csv")]
        _logger.debug(f"sources: {source_names}")
        for sname in source_names:
            _logger.debug(f"opening {fname} / {sname}")
            df = _load_country_emissions(zf.open(sname))
            df = (df.with_columns(
                pl.lit(fname.replace(".zip", "")).cast(pl.Categorical).alias(CT_PACKAGE),
                pl.lit(sname.replace("_country_emissions.csv", "")).cast(pl.Categorical).alias(CT_FILE)))
            dfs.append(df)
    res_df = pl.concat(dfs)
    return res_df
    

ct_data_source = "/home/tjhunter/Downloads/"
cedf = read_country_emissions(Path(ct_data_source))

if False:
    df = read_all_sources(Path(ct_data_source))
    # Using snappy because it is more broadly compatible with parquet readers
    # Polars 0.20 does not support statistics
    for year in range(2015, 2023):
        df.filter(c_start_time.dt.year()==year).sink_parquet(
            f"/home/tjhunter/Downloads/ct_data_{year}.parquet",
                    compression="snappy",
                    statistics=False
                   )
DEBUG:__main__:open agriculture.zip
DEBUG:__main__:sources: ['enteric-fermentation-other_country_emissions.csv', 'other-agricultural-soil-emissions_country_emissions.csv', 'manure-management-other_country_emissions.csv', 'rice-cultivation_country_emissions.csv', 'enteric-fermentation-cattle-feedlot_country_emissions.csv', 'manure-management-cattle-feedlot_country_emissions.csv', 'synthetic-fertilizer-application_country_emissions.csv', 'cropland-fires_country_emissions.csv', 'enteric-fermentation-cattle-pasture_country_emissions.csv', 'manure-left-on-pasture-cattle_country_emissions.csv']
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-other_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / other-agricultural-soil-emissions_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-management-other_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / rice-cultivation_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-cattle-feedlot_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-management-cattle-feedlot_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / synthetic-fertilizer-application_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / cropland-fires_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / enteric-fermentation-cattle-pasture_country_emissions.csv
DEBUG:__main__:opening agriculture.zip / manure-left-on-pasture-cattle_country_emissions.csv
DEBUG:__main__:open buildings.zip
DEBUG:__main__:sources: ['other-onsite-fuel-usage_country_emissions.csv', 'residential-and-commercial-onsite-fuel-usage_country_emissions.csv']
DEBUG:__main__:opening buildings.zip / other-onsite-fuel-usage_country_emissions.csv
DEBUG:__main__:opening buildings.zip / residential-and-commercial-onsite-fuel-usage_country_emissions.csv
DEBUG:__main__:open fluorinated_gases.zip
DEBUG:__main__:sources: ['fluorinated-gases_country_emissions.csv']
DEBUG:__main__:opening fluorinated_gases.zip / fluorinated-gases_country_emissions.csv
DEBUG:__main__:open forestry_and_land_use.zip
DEBUG:__main__:sources: ['water-reservoirs_country_emissions.csv', 'forest-land-degradation_country_emissions.csv', 'shrubgrass-fires_country_emissions.csv', 'wetland-fires_country_emissions.csv', 'net-shrubgrass_country_emissions.csv', 'forest-land-fires_country_emissions.csv', 'removals_country_emissions.csv', 'net-wetland_country_emissions.csv', 'net-forest-land_country_emissions.csv', 'forest-land-clearing_country_emissions.csv']
DEBUG:__main__:opening forestry_and_land_use.zip / water-reservoirs_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-degradation_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / shrubgrass-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / wetland-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-shrubgrass_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-fires_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / removals_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-wetland_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / net-forest-land_country_emissions.csv
DEBUG:__main__:opening forestry_and_land_use.zip / forest-land-clearing_country_emissions.csv
DEBUG:__main__:open fossil_fuel_operations.zip
DEBUG:__main__:sources: ['other-fossil-fuel-operations_country_emissions.csv', 'solid-fuel-transformation_country_emissions.csv', 'oil-and-gas-refining_country_emissions.csv', 'coal-mining_country_emissions.csv', 'oil-and-gas-production-and-transport_country_emissions.csv']
DEBUG:__main__:opening fossil_fuel_operations.zip / other-fossil-fuel-operations_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / solid-fuel-transformation_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / oil-and-gas-refining_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / coal-mining_country_emissions.csv
DEBUG:__main__:opening fossil_fuel_operations.zip / oil-and-gas-production-and-transport_country_emissions.csv
DEBUG:__main__:open manufacturing.zip
DEBUG:__main__:sources: ['petrochemicals_country_emissions.csv', 'other-manufacturing_country_emissions.csv', 'pulp-and-paper_country_emissions.csv', 'chemicals_country_emissions.csv', 'aluminum_country_emissions.csv', 'steel_country_emissions.csv', 'cement_country_emissions.csv']
DEBUG:__main__:opening manufacturing.zip / petrochemicals_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / other-manufacturing_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / pulp-and-paper_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / chemicals_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / aluminum_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / steel_country_emissions.csv
DEBUG:__main__:opening manufacturing.zip / cement_country_emissions.csv
DEBUG:__main__:open mineral_extraction.zip
DEBUG:__main__:sources: ['rock-quarrying_country_emissions.csv', 'sand-quarrying_country_emissions.csv', 'copper-mining_country_emissions.csv', 'iron-mining_country_emissions.csv', 'bauxite-mining_country_emissions.csv']
DEBUG:__main__:opening mineral_extraction.zip / rock-quarrying_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / sand-quarrying_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / copper-mining_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / iron-mining_country_emissions.csv
DEBUG:__main__:opening mineral_extraction.zip / bauxite-mining_country_emissions.csv
DEBUG:__main__:open power.zip
DEBUG:__main__:sources: ['other-energy-use_country_emissions.csv', 'electricity-generation_country_emissions.csv']
DEBUG:__main__:opening power.zip / other-energy-use_country_emissions.csv
DEBUG:__main__:opening power.zip / electricity-generation_country_emissions.csv
DEBUG:__main__:open transportation.zip
DEBUG:__main__:sources: ['other-transport_country_emissions.csv', 'railways_country_emissions.csv', 'road-transportation_country_emissions.csv', 'international-aviation_country_emissions.csv', 'domestic-aviation_country_emissions.csv', 'international-shipping_country_emissions.csv', 'domestic-shipping_country_emissions.csv']
DEBUG:__main__:opening transportation.zip / other-transport_country_emissions.csv
DEBUG:__main__:opening transportation.zip / railways_country_emissions.csv
DEBUG:__main__:opening transportation.zip / road-transportation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / international-aviation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / domestic-aviation_country_emissions.csv
DEBUG:__main__:opening transportation.zip / international-shipping_country_emissions.csv
DEBUG:__main__:opening transportation.zip / domestic-shipping_country_emissions.csv
DEBUG:__main__:open waste.zip
DEBUG:__main__:sources: ['incineration-and-open-burning-of-waste_country_emissions.csv', 'biological-treatment-of-solid-waste-and-biogenic_country_emissions.csv', 'solid-waste-disposal_country_emissions.csv', 'wastewater-treatment-and-discharge_country_emissions.csv']
DEBUG:__main__:opening waste.zip / incineration-and-open-burning-of-waste_country_emissions.csv
DEBUG:__main__:opening waste.zip / biological-treatment-of-solid-waste-and-biogenic_country_emissions.csv
DEBUG:__main__:opening waste.zip / solid-waste-disposal_country_emissions.csv
DEBUG:__main__:opening waste.zip / wastewater-treatment-and-discharge_country_emissions.csv
cedf
shape: (512_110, 12)
iso3_countrystart_timeend_timeoriginal_inventory_sectorgasemissions_quantityemissions_quantity_unitstemporal_granularitycreated_datemodified_datect_packagect_file
enumdatetime[ns]datetime[ns]strenumf64strenumdatetime[ns]datetime[ns]catcat
"AUT"2022-01-01 00:00:002022-12-31 00:00:00"enteric-fermen…"n2o"0.0"tonnes"null2023-10-23 11:43:08.395355null"agriculture""enteric-fermen…
"AZE"2022-01-01 00:00:002022-12-31 00:00:00"enteric-fermen…"co2"0.0"tonnes"null2023-10-23 11:43:08.395355null"agriculture""enteric-fermen…
"ABW"2022-01-01 00:00:002022-12-31 00:00:00"enteric-fermen…"co2"0.0"tonnes"null2023-10-23 11:43:08.395355null"agriculture""enteric-fermen…
"ABW"2022-01-01 00:00:002022-12-31 00:00:00"enteric-fermen…"n2o"0.0"tonnes"null2023-10-23 11:43:08.395355null"agriculture""enteric-fermen…
"ABW"2022-01-01 00:00:002022-12-31 00:00:00"enteric-fermen…"ch4"0.0"tonnes"null2023-10-23 11:43:08.395355null"agriculture""enteric-fermen…
"TUV"2016-01-01 00:00:002016-12-31 00:00:00"wastewater-tre…"co2e_100yr"2516.497358"tonnes""annual"2023-10-06 14:56:33.4283282023-11-03 15:13:27.777139"waste""wastewater-tre…
"UGA"2016-01-01 00:00:002016-12-31 00:00:00"wastewater-tre…"co2e_20yr"1.6999e7"tonnes""annual"2023-10-06 14:56:33.4283282023-11-03 15:13:28.159334"waste""wastewater-tre…
"UGA"2015-01-01 00:00:002015-12-31 00:00:00"wastewater-tre…"co2e_20yr"1.6456e7"tonnes""annual"2023-10-06 14:56:33.4283282023-11-03 15:13:34.328448"waste""wastewater-tre…
"ZWE"2015-01-01 00:00:002015-12-31 00:00:00"wastewater-tre…"co2e_20yr"6.1586e6"tonnes""annual"2023-10-06 14:56:33.4283282023-11-03 15:13:34.624499"waste""wastewater-tre…
"ZWE"2015-01-01 00:00:002015-12-31 00:00:00"wastewater-tre…"co2e_100yr"2.1487e6"tonnes""annual"2023-10-06 14:56:33.4283282023-11-03 15:13:34.624499"waste""wastewater-tre…

Country checks#

gas = "co2e_100yr"
year = 2022

cedf_gy = cedf.filter(c_gas == gas).filter(c_start_time.dt.year()==year)
((cedf_gy
 .filter(c_emissions_quantity > 0) # I think this is the most relevant, but to check
)[EMISSIONS_QUANTITY].sum(),
(cedf_gy
)[EMISSIONS_QUANTITY].sum())
(76430900242.65804, 55171507719.83738)
px.bar(cedf_gy
    .filter(c_emissions_quantity > 0)
    .group_by(c_iso3_country)
    .agg(c_emissions_quantity.sum())
    .sort([c_emissions_quantity], descending=True)
    .head(20)
,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,log_y=True)

# Mozambique emits more than Germany??
px.bar(cedf_gy
    .group_by(c_iso3_country, c_original_inventory_sector)
    .agg(c_emissions_quantity.sum())
    .sort([c_emissions_quantity], descending=True)
    .filter(c_iso3_country.is_in([
        "CHN", "USA", "IND", "RUS",
        # "MOZ",
        "FRA", "NLD"]))
,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,color=ORIGINAL_INVENTORY_SECTOR,log_y=False)

# Chinese forests remove more than all of france emissions
px.bar(cedf_gy
    .group_by(c_original_inventory_sector)
    .agg(c_emissions_quantity.sum())
    .sort([c_emissions_quantity], descending=True)
    .head(10)
,x=ORIGINAL_INVENTORY_SECTOR, y=EMISSIONS_QUANTITY,log_y=False)
px.bar(cedf_gy
    .group_by(c_original_inventory_sector)
    .agg(c_emissions_quantity.sum())
    .sort([c_emissions_quantity], descending=True)
    .tail(5)
,x=ORIGINAL_INVENTORY_SECTOR, y=EMISSIONS_QUANTITY,log_y=False)

Source analysis#

# Parquet loses enum structures.
sdf = (pl.scan_parquet(f"/home/tjhunter/Downloads/ct_data_{year}.parquet").pipe(_recast_parquet)
      )
sdf_gy = sdf.filter(c_gas == gas) #.collect()
sdf.select((c_start_time.dt.year())).min().collect() #.group_by("conf_total_co2e_100yrgwp").count().collect()
shape: (1, 1)
start_time
i32
2022

Number of distinct sources for last year

sdf_gy.select(c_source_id.unique_counts()).count().collect().item()
627582
(sdf.filter(c_original_inventory_sector.is_null())
 .group_by(c_start_time.dt.year(), C("ct_package"), C("ct_file"))
 .agg(pl.count())
 .collect())
/tmp/ipykernel_1880515/3687814498.py:3: DeprecationWarning:

`pl.count()` is deprecated. Please use `pl.len()` instead.
shape: (9, 4)
start_timect_packagect_filecount
i32catcatu32
2022"forestry_and_l…"wetland-fires"32730
2022"forestry_and_l…"forest-land-cl…211315
2022"forestry_and_l…"shrubgrass-fir…93935
2022"forestry_and_l…"net-wetland"213760
2022"forestry_and_l…"net-shrubgrass…248045
2022"forestry_and_l…"forest-land-fi…81120
2022"forestry_and_l…"net-forest-lan…248300
2022"forestry_and_l…"forest-land-de…94420
2022"forestry_and_l…"removals"250620
px.bar(sdf
 .select(c_original_inventory_sector.value_counts(sort=True))
 .collect()
 .unnest(ORIGINAL_INVENTORY_SECTOR),
      x=ORIGINAL_INVENTORY_SECTOR, y="count", log_y=True)
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:

File /usr/lib/python3.10/importlib/__init__.py:126, in import_module(name, package)
    125         level += 1
--> 126 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1050, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1027, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1004, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py:1436, in build_dataframe(args, constructor)
   1435         columns = list(necessary_columns)
-> 1436         args["data_frame"] = pd.api.interchange.from_dataframe(
   1437             args["data_frame"].select_columns_by_name(columns)
   1438         )
   1439 except (ImportError, NotImplementedError) as exc:
   1440     # temporary workaround; developers of third-party libraries themselves
   1441     # should try a different implementation, if available. For example:
   1442     # def __dataframe__(self, ...):
   1443     #   if not some_condition:
   1444     #     self.to_pandas(...)

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:71, in from_dataframe(df, allow_copy)
     69     raise ValueError("`df` does not support __dataframe__")
---> 71 return _from_dataframe(
     72     df.__dataframe__(allow_copy=allow_copy), allow_copy=allow_copy
     73 )

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:94, in _from_dataframe(df, allow_copy)
     93 for chunk in df.get_chunks():
---> 94     pandas_df = protocol_df_chunk_to_pandas(chunk)
     95     pandas_dfs.append(pandas_df)

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:148, in protocol_df_chunk_to_pandas(df)
    147 elif dtype == DtypeKind.STRING:
--> 148     columns[name], buf = string_column_to_ndarray(col)
    149 elif dtype == DtypeKind.DATETIME:

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:300, in string_column_to_ndarray(col)
    299 valid_buff, valid_dtype = buffers["validity"]
--> 300 null_pos = buffer_to_ndarray(
    301     valid_buff, valid_dtype, offset=col.offset, length=col.size()
    302 )
    303 if sentinel_val == 0:

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/core/interchange/from_dataframe.py:445, in buffer_to_ndarray(buffer, dtype, length, offset)
    444 assert length is not None, "`length` must be specified for a bit-mask buffer."
--> 445 pa = import_optional_dependency("pyarrow")
    446 arr = pa.BooleanArray.from_buffers(
    447     pa.bool_(),
    448     length,
    449     [None, pa.foreign_buffer(buffer.ptr, length)],
    450     offset=offset,
    451 )

File ~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/pandas/compat/_optional.py:138, in import_optional_dependency(name, extra, errors, min_version)
    137 if errors == "raise":
--> 138     raise ImportError(msg)
    139 return None

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.

During handling of the above exception, another exception occurred:

ModuleNotFoundError                       Traceback (most recent call last)
/tmp/ipykernel_1880515/786101580.py in ?()
----> 1 px.bar(sdf
      2  .select(c_original_inventory_sector.value_counts(sort=True))
      3  .collect()
      4  .unnest(ORIGINAL_INVENTORY_SECTOR),

~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_chart_types.py in ?(data_frame, x, y, color, pattern_shape, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, hover_name, hover_data, custom_data, text, base, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, color_discrete_sequence, color_discrete_map, color_continuous_scale, pattern_shape_sequence, pattern_shape_map, range_color, color_continuous_midpoint, opacity, orientation, barmode, log_x, log_y, range_x, range_y, text_auto, title, template, width, height)
    369     """
    370     In a bar plot, each row of `data_frame` is represented as a rectangular
    371     mark.
    372     """
--> 373     return make_figure(
    374         args=locals(),
    375         constructor=go.Bar,
    376         trace_patch=dict(textposition="auto"),

~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py in ?(args, constructor, trace_patch, layout_patch)
   2086     trace_patch = trace_patch or {}
   2087     layout_patch = layout_patch or {}
   2088     apply_default_cascade(args)
   2089 
-> 2090     args = build_dataframe(args, constructor)
   2091     if constructor in [go.Treemap, go.Sunburst, go.Icicle] and args["path"] is not None:
   2092         args = process_dataframe_hierarchy(args)
   2093     if constructor in [go.Pie]:

~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/plotly/express/_core.py in ?(args, constructor)
   1446                 args["data_frame"] = df_not_pandas.toPandas()
   1447             elif hasattr(df_not_pandas, "to_pandas_df"):
   1448                 args["data_frame"] = df_not_pandas.to_pandas_df()
   1449             elif hasattr(df_not_pandas, "to_pandas"):
-> 1450                 args["data_frame"] = df_not_pandas.to_pandas()
   1451             else:
   1452                 raise exc
   1453 

~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/polars/dataframe/frame.py in ?(self, use_pyarrow_extension_array, **kwargs)
   2257             return self._to_pandas_with_object_columns(
   2258                 use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
   2259             )
   2260 
-> 2261         return self._to_pandas_without_object_columns(
   2262             self, use_pyarrow_extension_array=use_pyarrow_extension_array, **kwargs
   2263         )

~/.cache/pypoetry/virtualenvs/climate-tutorials-Yg9kLiuM-py3.10/lib/python3.10/site-packages/polars/dataframe/frame.py in ?(self, df, use_pyarrow_extension_array, **kwargs)
   2308     ) -> pd.DataFrame:
   2309         if not df.width:  # Empty dataframe, cannot infer schema from batches
   2310             return pd.DataFrame()
   2311 
-> 2312         record_batches = df._df.to_pandas()
   2313         tbl = pa.Table.from_batches(record_batches)
   2314         if use_pyarrow_extension_array:
   2315             return tbl.to_pandas(

ModuleNotFoundError: No module named 'pyarrow'
px.bar(sdf.select(C("ct_file").value_counts(sort=True)).collect()
       .unnest("ct_file"),
       x="ct_file", y="count",log_y=True)
(sdf_gy
 .group_by(c_original_inventory_sector)
 .agg(c_source_id.count().alias("c"), c_emissions_quantity.sum())
 .collect()
 .sort("c", descending=True))
shape: (31, 3)
original_inventory_sectorcemissions_quantity
stru32f64
null294849-3.8651e9
"domestic-shipp…2139831.9471e8
"international-…1399575.1798e8
"wastewater-tre…534661.3067e8
"cropland-fires…479403.5394e9
"manure-left-on…462451.1210e9
"enteric-fermen…462453.2797e9
"synthetic-fert…432091.3111e9
"cement"271081.8021e9
"road-transport…169223.1400e9
"steel"109201.7520e9
"solid-waste-di…103141.0297e9
"enteric-fermen…46395.0844e7
"coal-mining"31641.4659e9
"aluminum"27603.49556024e8
"other-manufact…21152.7386e8
"oil-and-gas-pr…16478.4849e8
"oil-and-gas-re…6849.7051e8
"oil-and-gas-pr…6603.4766e9
"copper-mining"5558.3548502e7
"iron-mining"5367.1405542e7
"pulp-and-paper…3878.3640643e7
"bauxite-mining…1758.114666e6
"petrochemicals…1161.4665e8
(sdf_gy
.group_by(c_iso3_country, c_original_inventory_sector)
 .agg(c_emissions_quantity.sum())
     .sort([c_emissions_quantity], descending=True)
 .filter(c_iso3_country.is_in(["CHN", "USA", "FRA"]))
 .collect()
)
shape: (83, 3)
iso3_countryoriginal_inventory_sectoremissions_quantity
enumstrf64
"CHN""electricity-ge…4.5421e9
"USA""electricity-ge…1.4893e9
"CHN""coal-mining"1.0123e9
"CHN""steel"9.22971074e8
"USA""oil-and-gas-pr…8.5006e8
"USA""oil-and-gas-pr…8.4849e8
"USA""road-transport…8.2750e8
"CHN""cement"7.046727e8
"CHN""road-transport…4.0347e8
"CHN""rice-cultivati…3.9366e8
"CHN""synthetic-fert…2.9521e8
"CHN""chemicals"2.6414734e8
"CHN""enteric-fermen…2.6370e6
"FRA""aluminum"2.626109e6
"FRA""solid-waste-di…2.569196e6
"FRA""domestic-shipp…2.1999e6
"FRA""domestic-aviat…1.8067e6
"FRA""pulp-and-paper…938314.0
"CHN""bauxite-mining…830360.0
"FRA""water-reservoi…106100.867704
"FRA""oil-and-gas-pr…86660.769884
"FRA""rice-cultivati…0.0
"FRA"null-4.5304e7
"CHN"null-6.4086e8
px.bar(
(sdf_gy
 .filter(c_emissions_quantity > 0 )
.group_by(c_iso3_country, c_original_inventory_sector)
 .agg(c_emissions_quantity.sum())
     .sort([c_emissions_quantity], descending=True)
 .filter(c_iso3_country.is_in(["CHN", "USA", "FRA", "NLD", "IND"]))
 .collect()
)
    
    ,x=ISO3_COUNTRY, y=EMISSIONS_QUANTITY,color=ORIGINAL_INVENTORY_SECTOR,log_y=False)
px.bar(
(sdf_gy
.group_by(c_original_inventory_sector)
 .agg(c_emissions_quantity.sum())
     .sort([c_emissions_quantity], descending=True)
 .collect()
),
    
    x=ORIGINAL_INVENTORY_SECTOR,y="emissions_quantity",log_y=True)

Joint analysis#

cedf_gy_agg = (cedf_gy
    .group_by(c_iso3_country, c_ct_package, c_ct_file)
    .agg(c_emissions_quantity.sum())
              )

sdf_gy_agg = (sdf_gy
    .group_by(c_iso3_country, c_ct_package, c_ct_file)
    .agg(c_emissions_quantity.sum())
    .collect()
)

Total emissions from country inventory

(cedf_gy_agg
 .with_columns((c_emissions_quantity > 0).alias("is_emission"))
 .group_by("is_emission")
 .agg(c_emissions_quantity.sum())
 .sort("is_emission")
)
shape: (2, 2)
is_emissionemissions_quantity
boolf64
false-2.1259e10
true7.6431e10

Total emissions from source inventory

(sdf_gy_agg
 .with_columns((c_emissions_quantity > 0).alias("is_emission"))
 .group_by("is_emission")
 .agg(c_emissions_quantity.sum())
 .sort("is_emission")
)
shape: (2, 2)
is_emissionemissions_quantity
boolf64
false-5.7529e10
true9.5150e10

TODO: there is a significant discrepancy between the total reported for each report.

EMISSIONS_QUANTITY_S = EMISSIONS_QUANTITY + "_s"
EMISSIONS_QUANTITY_CE = EMISSIONS_QUANTITY + "_ce"
jdf = (cedf_gy_agg
       .join(sdf_gy_agg, on=[ISO3_COUNTRY, CT_PACKAGE, CT_FILE], suffix="_s", how="outer")
.with_columns(
    c_iso3_country.fill_null(C(ISO3_COUNTRY+"_s")),
    c_ct_package.fill_null(C(CT_PACKAGE+"_s")),
    c_ct_file.fill_null(C(CT_FILE+"_s")),
    c_emissions_quantity.alias(EMISSIONS_QUANTITY_CE),
).select(
    c_iso3_country,
    c_ct_package,
    c_ct_file,
    EMISSIONS_QUANTITY_S,
    EMISSIONS_QUANTITY_CE
))
jdf
shape: (12_916, 5)
iso3_countryct_packagect_fileemissions_quantity_semissions_quantity_ce
enumcatcatf64f64
"ABW""agriculture""enteric-fermen…null0.0
"CUW""agriculture""enteric-fermen…null0.0
"SWZ""agriculture""enteric-fermen…null47815.6
"VAT""agriculture""enteric-fermen…null0.0
"VGB""agriculture""enteric-fermen…null0.0
"IOT""agriculture""enteric-fermen…null0.0
"MAR""agriculture""enteric-fermen…null4.4698e6
"MNP""agriculture""enteric-fermen…null0.0
"ALB""agriculture""enteric-fermen…null517546.4
"BEL""agriculture""enteric-fermen…null308582.4
"COD""agriculture""enteric-fermen…null731715.6
"CYP""agriculture""enteric-fermen…null88902.8
"GIN""agriculture""synthetic-fert…4917.187885null
"HKG""agriculture""enteric-fermen…0.0null
"LIE""agriculture""enteric-fermen…0.0null
"SUR""agriculture""synthetic-fert…494.277044null
"BRN""agriculture""synthetic-fert…8130.169221null
"BFA""agriculture""synthetic-fert…3200.893827null
"BWA""agriculture""synthetic-fert…3075.866695null
"ZNC""forestry_and_l…"net-shrubgrass…-82226.761935null
"ZNC""forestry_and_l…"forest-land-cl…2417.281475null
"TLS""agriculture""synthetic-fert…8746.57223null
"TKM""agriculture""synthetic-fert…46194.154235null
"MOZ""agriculture""synthetic-fert…6825.984899null

Sanity check to ensure that no emission was lost along the way

jdf.filter((c_iso3_country == "GRC") & (c_ct_file == "electricity-generation"))
shape: (1, 5)
iso3_countryct_packagect_fileemissions_quantity_semissions_quantity_ce
enumcatcatf64f64
"GRC""power""electricity-ge…1.7217e71.8314e7
# cedf_gy.filter((c_iso3_country == "GRC") & (c_ct_file == "electricity-generation"))
list(cedf_gy.filter((c_iso3_country == "GRC") & (c_ct_package == "power")).tail(1))
[shape: (1,)
 Series: 'iso3_country' [enum]
 [
 	"GRC"
 ],
 shape: (1,)
 Series: 'start_time' [datetime[ns]]
 [
 	2022-01-01 00:00:00
 ],
 shape: (1,)
 Series: 'end_time' [datetime[ns]]
 [
 	2022-12-31 00:00:00
 ],
 shape: (1,)
 Series: 'original_inventory_sector' [str]
 [
 	"electricity-ge…
 ],
 shape: (1,)
 Series: 'gas' [enum]
 [
 	"co2e_100yr"
 ],
 shape: (1,)
 Series: 'emissions_quantity' [f64]
 [
 	1.8314e7
 ],
 shape: (1,)
 Series: 'emissions_quantity_units' [str]
 [
 	"tonnes"
 ],
 shape: (1,)
 Series: 'temporal_granularity' [enum]
 [
 	"annual"
 ],
 shape: (1,)
 Series: 'created_date' [datetime[ns]]
 [
 	2023-10-03 21:10:38.966803
 ],
 shape: (1,)
 Series: 'modified_date' [datetime[ns]]
 [
 	2023-10-31 20:30:14.210153
 ],
 shape: (1,)
 Series: 'ct_package' [cat]
 [
 	"power"
 ],
 shape: (1,)
 Series: 'ct_file' [cat]
 [
 	"electricity-generation"
 ]]
(jdf
 .with_columns(((C(EMISSIONS_QUANTITY_S) > 0) | (C(EMISSIONS_QUANTITY_CE) > 0)).alias("is_emission"))
 .group_by("is_emission")
 .agg(C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum()))
shape: (3, 3)
is_emissionemissions_quantity_semissions_quantity_ce
boolf64f64
true9.4461e107.6428e10
false-5.6840e10-2.1257e10
null-461722.0745120.0
jdf_cmp = (jdf
           # .drop_nulls()
           .group_by(c_ct_package, c_ct_file, c_iso3_country)
           .agg(C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum())
           .with_columns((C(EMISSIONS_QUANTITY_S)/C(EMISSIONS_QUANTITY_CE)).alias("found"))
)
jdf_cmp
shape: (12_916, 6)
ct_packagect_fileiso3_countryemissions_quantity_semissions_quantity_cefound
catcatenumf64f64f64
"agriculture""enteric-fermen…"CYP"0.088902.80.0
"agriculture""enteric-fermen…"ERI"0.01.090222e60.0
"agriculture""enteric-fermen…"KOR"0.0378792.40.0
"agriculture""enteric-fermen…"LTU"0.063394.80.0
"agriculture""manure-managem…"ARM"0.033541.40.0
"agriculture""manure-managem…"MWI"0.01194218.30.0
"agriculture""manure-managem…"PER"0.0831675.70.0
"agriculture""manure-managem…"PRI"0.021196.70.0
"agriculture""rice-cultivati…"JAM"0.00.0NaN
"agriculture""rice-cultivati…"POL"0.00.0NaN
"agriculture""rice-cultivati…"FLK"0.00.0NaN
"agriculture""rice-cultivati…"HMD"0.00.0NaN
"forestry_and_l…"net-wetland""ZNC"-1711.0046570.0-inf
"agriculture""synthetic-fert…"GUY"1494.0819840.0inf
"agriculture""synthetic-fert…"BLZ"3115.314780.0inf
"agriculture""synthetic-fert…"AND"17.0535870.0inf
"agriculture""enteric-fermen…"AND"0.00.0NaN
"agriculture""synthetic-fert…"NAM"807.0562010.0inf
"agriculture""synthetic-fert…"XKX"2155.8182960.0inf
"agriculture""synthetic-fert…"MCO"0.0052250.0inf
"agriculture""synthetic-fert…"PNG"24.1423330.0inf
"agriculture""synthetic-fert…"BOL"184266.1895470.0inf
"agriculture""synthetic-fert…"OMN"0.00.0NaN
"agriculture""enteric-fermen…"HKG"0.00.0NaN
px.scatter(jdf_cmp, x=EMISSIONS_QUANTITY_CE, y=EMISSIONS_QUANTITY_S,
           color=CT_PACKAGE, # Does not seem to work
           hover_name=ISO3_COUNTRY,
           hover_data=[CT_PACKAGE, CT_FILE],
          log_x=True, log_y=True)
(jdf_cmp
 # .group_by(c_original_inventory_sector)
 .select(
    C(EMISSIONS_QUANTITY_S).sum(), C(EMISSIONS_QUANTITY_CE).sum()
))
shape: (1, 2)
emissions_quantity_semissions_quantity_ce
f64f64
3.7621e105.5172e10
px.histogram(jdf_cmp, x="found",
             color=CT_PACKAGE,
             range_x=[0,3],
             nbins=5000)
# px.bar(
# sdf_gy.group_by(["conf_co2_emissions_factor", c_original_inventory_sector]).count().collect()
# ,facet_col="conf_co2_emissions_factor",y="count",facet_row=ORIGINAL_INVENTORY_SECTOR
# )

Finding consistent estimates with pymc#

subs_df = (jdf_cmp
 .filter(c_ct_file == "electricity-generation")
 .filter((C("found") > 0.1) & (C("found") < 10))
 .drop_nulls()
 .sort(by=EMISSIONS_QUANTITY_S))
px.histogram(subs_df, x=EMISSIONS_QUANTITY_CE,
             hover_name=ISO3_COUNTRY,
             # log_x=True,
             # range_x=[0,3],
             nbins=500)
subs_df
shape: (163, 6)
ct_packagect_fileiso3_countryemissions_quantity_semissions_quantity_cefound
catcatenumf64f64f64
"power""electricity-ge…"SLE"5000.016000.00.3125
"power""electricity-ge…"UGA"15000.081000.00.185185
"power""electricity-ge…"GNB"27000.067000.00.402985
"power""electricity-ge…"TCD"33000.0249000.00.13253
"power""electricity-ge…"DJI"38000.049000.00.77551
"power""electricity-ge…"NER"65000.0311000.00.209003
"power""electricity-ge…"BEN"76000.0166000.00.457831
"power""electricity-ge…"NAM"81000.089000.00.910112
"power""electricity-ge…"COD"83000.0321000.00.258567
"power""electricity-ge…"GIN"86000.0640000.00.134375
"power""electricity-ge…"GMB"108000.0255000.00.423529
"power""electricity-ge…"RWA"112000.0287000.00.390244
"power""electricity-ge…"TWN"1.59274e81.6489e80.965941
"power""electricity-ge…"ZAF"1.99288e81.99571e80.998582
"power""electricity-ge…"IRN"2.07061e82.17963e80.949982
"power""electricity-ge…"IDN"2.32013e82.46401e80.941607
"power""electricity-ge…"DEU"2.35282e82.56218e80.918288
"power""electricity-ge…"SAU"2.53227e82.6305e80.962657
"power""electricity-ge…"KOR"2.55641e82.62354e80.974412
"power""electricity-ge…"JPN"4.14062e84.39829e80.941416
"power""electricity-ge…"RUS"5.33325e85.59267e80.953614
"power""electricity-ge…"IND"1.2134e91.2486e90.971774
"power""electricity-ge…"USA"1.4893e91.4955e90.995854
"power""electricity-ge…"CHN"4.5421e94.6779e90.970958
import arviz as az
import pymc as pm
import numpy as np
logging.getLogger("matplotlib").setLevel("WARNING")
logging.getLogger("filelock").setLevel("WARNING")
DEBUG:matplotlib:matplotlib data path: /home/tjhunter/work/carbonmap/science/climate_trace/.venv/lib/python3.10/site-packages/matplotlib/mpl-data
DEBUG:matplotlib:CONFIGDIR=/home/tjhunter/.config/matplotlib
DEBUG:matplotlib:interactive is False
DEBUG:matplotlib:platform is linux
DEBUG:matplotlib:CACHEDIR=/home/tjhunter/.cache/matplotlib
DEBUG:matplotlib.font_manager:Using fontManager instance from /home/tjhunter/.cache/matplotlib/fontlist-v330.json
WARNING:pytensor.tensor.blas:Using NumPy C-API based implementation for BLAS functions.
basic_model = pm.Model()

em_s = np.array([1.4893, 2.5, 5.0])
em_ce = np.array([1.4955, 1.0, 1.0])

def shiftedGamma(mean, size):
    alpha = 8.5
    beta = 2.0
    # Putting the observed value on the mode of the distribution
    mode = (alpha-1) / beta
    # Mapping 10.m -> 0, m -> 1.0
    return mean  * (4/3. - (1.0/3) * (1.0 / mode) * pm.Gamma.dist(alpha=alpha, beta = beta, size=size))

with basic_model:
    init_est = em_ce
    epsi = pm.TruncatedNormal("epsi", mu = init_est, sigma=0.5 * init_est, lower=init_est * 0.5, upper=init_est * 10.0)
    cp = pm.CustomDist("cp", epsi, dist=shiftedGamma, observed=em_ce)
    ip = pm.CustomDist("ip", epsi, dist=shiftedGamma, observed=em_s)

# with basic_model:
#     mean = 1.0
#     alpha = 8.5
#     beta = 2.5
#     # Putting the observed value on the mode of the distribution
#     mode = (alpha-1) / beta
#     # Mapping 4.m -> 0, m -> 1.0
#     res = pm.Deterministic("pc", mean * (4/3. - (1.0/3) * (1.0 / mode) * pm.Gamma("gam", alpha=alpha, beta = beta)))
with basic_model:
    # draw 1000 posterior samples
    idata = pm.sample()
INFO:pymc.sampling.mcmc:Auto-assigning NUTS sampler...
INFO:pymc.sampling.mcmc:Initializing NUTS using jitter+adapt_diag...
INFO:pymc.sampling.mcmc:Multiprocess sampling (4 chains in 4 jobs)
INFO:pymc.sampling.mcmc:NUTS: [epsi]
100.00% [8000/8000 00:01<00:00 Sampling 4 chains, 26 divergences]
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1531.2619304730674.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2106.1051656863983.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3722.466882441551.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1305.9654349923946.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1038.5129616417235.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 17536.169913009326.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 5647.055561227237.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1689.8615656169666.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1460.0419096957983.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1055.972918046751.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 5147.260470310525.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 10529.38165208882.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 4228.661923273439.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1271.6685896430693.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3118.8017076741685.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1309.8491936496227.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 35154.4731015717.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 3604.2682822398997.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1842.0076232103413.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1817.1840971448275.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1102.8433734595544.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1027.8079879592774.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2330.0343520601264.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2760.5998211858036.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 2679.2401135116706.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: 1478.1124079931085.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
DEBUG:pymc.stats.convergence:Energy change in leapfrog step is too large: inf.
INFO:pymc.sampling.mcmc:Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.
ERROR:pymc.stats.convergence:There were 26 divergences after tuning. Increase `target_accept` or reparameterize.
az.plot_trace(idata, combined=True);
_images/ff1ecfcb4fe2d670c46d67238d46359d9fc7aff2a68dd1dc194ae5d3c9bcd8a6.png
az.summary(idata, round_to=2)
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
epsi[0] 1.53 0.14 1.28 1.79 0.0 0.0 2814.46 2120.88 1.0
epsi[1] 2.25 0.14 2.02 2.50 0.0 0.0 2449.41 1774.92 1.0
epsi[2] 4.13 0.13 3.91 4.38 0.0 0.0 3219.41 2628.10 1.0

Analyzing error bounds#

df= (sdf_gy
 .group_by([c_iso3_country, c_ct_file, c_conf_total_co2e_100yrgwp, c_ct_package])
 .agg(pl.count().alias("count"), c_emissions_quantity.sum())
 .collect()
 # .sort(by=[ORIGINAL_INVENTORY_SECTOR, "conf_total_co2e_100yrgwp"])
 .sort(by=c_emissions_quantity)
)
df
shape: (5_585, 6)
iso3_countryct_fileconf_total_co2e_100yrgwpct_packagecountemissions_quantity
enumcatcatcatu32f64
"BRA""removals""medium""forestry_and_l…5597-4.8601e9
"COD""removals""medium""forestry_and_l…267-4.6924e9
"AGO""removals""medium""forestry_and_l…177-3.4949e9
"MOZ""removals""medium""forestry_and_l…141-3.0537e9
"ZMB""removals""medium""forestry_and_l…126-2.7422e9
"RUS""net-forest-lan…"medium""forestry_and_l…2479-2.2479e9
"AUS""removals""medium""forestry_and_l…551-2.2324e9
"BRA""net-forest-lan…"medium""forestry_and_l…5597-1.9387e9
"RUS""removals""medium""forestry_and_l…2479-1.9316e9
"USA""removals""medium""forestry_and_l…3195-1.7630e9
"CAF""removals""medium""forestry_and_l…69-1.5976e9
"BRA""net-shrubgrass…"medium""forestry_and_l…5597-1.2465e9
"BRA""forest-land-fi…"low""forestry_and_l…23381.1465e9
"USA""forest-land-cl…"low""forestry_and_l…30041.1858e9
"IND""electricity-ge…"medium""power"4711.2134e9
"USA""electricity-ge…"medium""power"21601.4784e9
"CAF""forest-land-fi…"low""forestry_and_l…692.0777e9
"AUS""forest-land-fi…"low""forestry_and_l…2772.4736e9
"ZMB""forest-land-fi…"low""forestry_and_l…1263.1439e9
"BRA""forest-land-cl…"low""forestry_and_l…55713.1661e9
"AGO""forest-land-fi…"low""forestry_and_l…1703.3166e9
"COD""forest-land-fi…"low""forestry_and_l…2193.6139e9
"MOZ""forest-land-fi…"low""forestry_and_l…1413.7217e9
"CHN""electricity-ge…"medium""power"12254.5421e9
import scipy as sp
margins = {"very high": 0.01, "high": 0.05,"medium":0.25, "low": 0.5, "very low": 1.}
# margins = {"very high": 0.01, "high": 0.05,"medium":0.1, "low": 0.15, "very low": 0.2}

ERR_MARGIN = "err_margin"
rankings = (df.with_columns(
    (c_conf_total_co2e_100yrgwp.replace(margins, return_dtype=pl.Float32, default=margins["very low"])
      * c_emissions_quantity).alias(ERR_MARGIN)
).group_by(c_iso3_country, c_ct_file, c_ct_package)
 .agg(c_emissions_quantity.sum(), C(ERR_MARGIN).sum())
 .sort(by=ERR_MARGIN, descending=True)
 .with_row_index()
)
_plot_data = (rankings
        .filter(~(c_ct_package == "forestry_and_land_use")))
_country_order = (_plot_data
                 .group_by(c_iso3_country)
                 .agg(C(ERR_MARGIN).sum())
                 .sort(by=ERR_MARGIN, descending=True)[ISO3_COUNTRY].to_list())
px.bar(_plot_data.filter(c_iso3_country.is_in(_country_order[:10])), x=ISO3_COUNTRY,
       y=ERR_MARGIN, 
       color=CT_PACKAGE,
       hover_name=CT_FILE, 
       category_orders = {ISO3_COUNTRY: country_order},
       log_y=False)
rankings.select(c_emissions_quantity.sum(), C(ERR_MARGIN).sum())
shape: (1, 2)
emissions_quantityerr_margin
f64f64
3.7621e102.5173e10